# -*-coding:utf-8 -*-

import os
import pandas as pd
import json

folder_path = "/home/ubuntu/code/git/subject-word-extraction/data/input/"

# Collect every *.json file found anywhere under `folder_path` into a single
# DataFrame, drop rows that share an "AdjunctUrl", derive a "Year" column from
# that URL, and write the result to res.csv.
frames = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if not file.lower().endswith(".json"):
            continue

        # Join with `root`, not `folder_path`: os.walk descends into
        # subdirectories, and `file` is only the base name relative to the
        # directory currently being visited.
        file_path = os.path.join(root, file)
        try:
            with open(file_path, "r", encoding="gbk") as f:
                data = json.load(f)
            df = pd.DataFrame(data)
        except Exception as e:
            # Best-effort load: one unreadable or malformed file must not
            # abort the whole run, but report it so the loss is visible.
            print("Skipping {}: {!r}".format(file_path, e))
            continue

        # NOTE: earlier CSV-based loading path, kept for reference.
        # df = pd.read_csv(file_path, encoding="utf-8", dtype={"SecName":str, "SecName.1":str,"SecCode":str})
        # df["SecCode"] = df["SecCode"].apply(lambda x:str(int(x)).zfill(6))
        frames.append(df)

if not frames:
    # Without any loaded data there is no "AdjunctUrl" column; fail with a
    # clear message instead of an opaque KeyError further down.
    raise SystemExit("No readable JSON files found under {}".format(folder_path))

# Concatenate once at the end; concatenating inside the loop re-copies the
# accumulated frame on every iteration.
res_df = pd.concat(frames, axis=0)

res_df = res_df.drop_duplicates(subset=["AdjunctUrl"])
# "Year" is the text before the first "-" in the second "/"-separated segment
# of AdjunctUrl (presumably URLs look like "<prefix>/YYYY-MM-DD/..." - verify
# against the input data).
res_df["Year"] = res_df["AdjunctUrl"].apply(lambda x:x.split("/")[1].split("-")[0])
res_df.to_csv("/home/ubuntu/code/git/subject-word-extraction/data/out/res.csv", index=False)

# res_df = pd.read_csv("/home/ubuntu/code/git/subject-word-extraction/data/out/res.csv", dtype={"CorpCode":str,"Year":str})

# df = pd.read_excel("/home/ubuntu/code/git/subject-word-extraction/data/in/buchong.xlsx", dtype={"CorpCode":str,"Year":str})
# df = df.rename(columns={"CorpCode":"SecCode"})
# df.head()
